# Computations
import numpy as np
import pandas as pd
import catboost
# Sklearn
from sklearn import metrics
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
# Visualisation libraries
## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex, clear_output
## progressbar
import progressbar
## plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
## seaborn
import seaborn as sns
## matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Polygon
from matplotlib.font_manager import FontProperties
import matplotlib.colors as mcolors
from matplotlib import cm
# Global matplotlib configuration for all figures in this notebook.
# NOTE(review): the 'seaborn-whitegrid' style name was removed in
# matplotlib >= 3.6 (renamed 'seaborn-v0_8-whitegrid') — confirm the
# matplotlib version this notebook targets.
plt.style.use('seaborn-whitegrid')
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
# IPython magic: render figures inline in the notebook.
%matplotlib inline
import warnings
# Suppress all warnings for cleaner notebook output.
warnings.filterwarnings("ignore")
In this article, we analyze the Liver Disorders Dataset from the UCI Machine Learning Repository.

Picture Source: niddk.nih.gov
The first five variables are all blood tests thought to be sensitive to liver disorders that might arise from excessive alcohol consumption. Each line in the dataset constitutes the record of a single male individual.
Important note: The 7th field (selector) has been widely misinterpreted in the past as a dependent variable representing the presence or absence of a liver disorder. This is incorrect [1]. The 7th field was created by BUPA researchers as a train/test selector. It is not suitable as a dependent variable for classification. The dataset does not contain any variable representing the presence or absence of a liver disorder. Researchers who wish to use this dataset as a classification benchmark should follow the method used in experiments by the donor (Forsyth & Rada, 1986, Machine learning: applications in expert systems and information retrieval) and others (e.g. Turney, 1995, Cost-sensitive classification: Empirical evaluation of a hybrid genetic decision tree induction algorithm), who used the 6th field (drinks), after dichotomizing, as a dependent variable for classification. Because of widespread misinterpretation in the past, researchers should take care to state their method clearly.
| Attribute | Information |
|---|---|
| MCV | Mean corpuscular volume |
| AlkPhos | Alkaline Phosphatase |
| Sgpt | Alanine Aminotransferase |
| Sgot | Aspartate Aminotransferase |
| GammaGT | Gamma-Glutamyl Transpeptidase |
| Drinks | Number of half-pint equivalents of alcoholic beverages drunk per day |
| Selector | Field used to split data into two sets |
# Load the BUPA liver disorders dataset (comma-separated, no header row).
# NOTE(review): the Drinks field can hold fractional half-pint values in the
# raw file; reading with dtype=int may mis-parse such rows — verify against
# the data file.
Data = np.genfromtxt('liver-disorders/bupa.data', delimiter=',', dtype = int)
Attributes = ['MCV', 'AlkPhos', 'Sgpt', 'Sgot', 'GammaGT', 'Drinks', 'Selector']
Data = pd.DataFrame(data = Data, columns = Attributes)
# Dependent variable for the regression model (see the note above about
# why Selector must NOT be used as the target).
Target = 'Drinks'
display(Data.head())
def Data_Plot(Inp, Title=None, W=None):
    """Summarize a DataFrame and plot per-feature completeness.

    Builds a table with each feature's dtype, NaN count, total size and
    percentage of non-missing values, then shows it as a plotly bar chart
    colored by dtype.

    Parameters
    ----------
    Inp : pd.DataFrame
        Data to summarize.
    Title : str, optional
        Bold, centered figure title.
    W : int, optional
        Figure width in pixels.

    Returns
    -------
    pd.DataFrame
        The summary table (Features, Data Type, Number of NaN Values,
        Size, Percentage).
    """
    data_info = Inp.dtypes.astype(str).to_frame(name='Data Type')
    Temp = Inp.isnull().sum().to_frame(name='Number of NaN Values')
    data_info = data_info.join(Temp, how='outer')
    data_info['Size'] = Inp.shape[0]
    # Percentage of non-missing values per feature.
    data_info['Percentage'] = 100 - np.round(100 * (data_info['Number of NaN Values'] / Inp.shape[0]), 2)
    data_info = data_info.reset_index(drop=False).rename(columns={'index': 'Features'})
    fig = px.bar(data_info, x='Features', y='Percentage', color='Data Type',
                 text='Percentage',
                 color_discrete_sequence=['PaleGreen', 'LightCyan', 'PeachPuff', 'Pink', 'Plum'],
                 hover_data=data_info.columns)
    fig.update_layout(plot_bgcolor='white', legend=dict(x=1.01, y=.5, traceorder="normal",
                                                        bordercolor="DarkGray", borderwidth=1))
    # Fixed: compare to None with identity (`is not None`), not equality (PEP 8).
    if W is not None:
        fig.update_layout(width=W)
    # NOTE(review): plotly's text placeholder is '%{text}'; the leading extra
    # '%' here renders a literal percent sign before the value — confirm
    # that prefix is intentional.
    fig.update_traces(texttemplate=10 * ' ' + '%%{text}', textposition='inside')
    fig.update_traces(marker_line_color='Black', marker_line_width=1., opacity=1)
    if Title is not None:
        fig.update_layout(title={'text': '<b>' + Title + '<b>', 'x': 0.5,
                                 'y': 0.90, 'xanchor': 'center', 'yanchor': 'top'})
    fig.show()
    return data_info
data_info = Data_Plot(Data, Title='Liver Disorders Dataset', W=500)
| MCV | AlkPhos | Sgpt | Sgot | GammaGT | Drinks | Selector | |
|---|---|---|---|---|---|---|---|
| 0 | 85 | 92 | 45 | 27 | 31 | 0 | 1 |
| 1 | 85 | 64 | 59 | 32 | 23 | 0 | 2 |
| 2 | 86 | 54 | 33 | 16 | 54 | 0 | 2 |
| 3 | 91 | 78 | 34 | 24 | 36 | 0 | 2 |
| 4 | 87 | 70 | 12 | 28 | 10 | 0 | 2 |
In this article, the dependent variable is the number of drinks. Note that the Selector column is intended only to split the data into train and test subsets for one particular experiment.
Moreover, high variance for some features can hurt our modeling process. For this reason, we can standardize features by removing the mean and scaling to unit variance.
def Feature_Normalize(X, PD):
    """Standardize features to zero mean / unit variance and plot variances.

    Draws two heatmaps (raw vs. standardized feature variance) and returns
    the standardized feature matrix.

    Parameters
    ----------
    X : pd.DataFrame
        Feature matrix (target column already dropped by the caller).
    PD : dict
        Plot parameters: 'figsize', 'hspace', 'annot_text_size' and
        'word_break' (words per line in axis tick labels, or None).

    Returns
    -------
    pd.DataFrame
        Standardized copy of X with the original column names.
    """
    def List_Break(mylist, n=PD['word_break']):
        # Wrap each label every `n` words so long feature names fit the axis.
        # Fixed: the original loop reused `n` as its loop variable (shadowing
        # the chunk size) and appended overlapping tails instead of chunks.
        Out = []
        for label in mylist:
            words = label.split()
            Out.append('\n'.join(' '.join(words[i:i + n])
                                 for i in range(0, len(words), n)))
        return Out
    # Fixed: the original ignored the X argument and re-read the global
    # `Data`; use the caller-supplied feature matrix instead. The unused
    # local `y` was removed.
    scaler = preprocessing.StandardScaler()
    X_std = pd.DataFrame(data=scaler.fit_transform(X), columns=X.columns)
    fig, ax = plt.subplots(2, 1, figsize=PD['figsize'])
    ax = ax.ravel()
    CP = [sns.color_palette("OrRd", 20), sns.color_palette("Greens", X.shape[1])]
    Names = ['Variance of the Features', 'Variance of the Features (Standardized)']
    Sets = [X, X_std]
    kws = dict(label='Feature\nVariance', aspect=10, shrink=.3)
    for i in range(len(ax)):
        # One-row frame of per-feature variances, largest first.
        Temp = Sets[i].var().sort_values(ascending=False).to_frame(name='Variance').round(2).T
        _ = sns.heatmap(Temp, ax=ax[i], annot=True, square=True, cmap=CP[i],
                        linewidths=0.8, vmin=0, vmax=Temp.max(axis=1)[0],
                        annot_kws={"size": PD['annot_text_size']},
                        cbar_kws=kws)
        if PD['word_break'] is not None:
            _ = ax[i].xaxis.set_ticklabels(List_Break(Temp.T.index.tolist()))
        _ = ax[i].set_yticklabels('')
        _ = ax[i].set_title(Names[i], weight='bold', fontsize=14)
        _ = ax[i].set_aspect(1)
        del Temp
    plt.subplots_adjust(hspace=PD['hspace'])
    return X_std
# Split the data into features X and target y.
X = Data.drop(columns = [Target])
y = Data[Target]
# Plot parameters: figure size, vertical spacing, annotation font size,
# and words-per-line when wrapping tick labels.
PD = dict(figsize = (8, 7), hspace = 0.2, annot_text_size = 12, word_break = 2)
# Replace X with its standardized version (zero mean, unit variance).
X = Feature_Normalize(X, PD)
def Header(Text, L = 100, C = 'Blue', T = 'White'):
    """Print Text as a colored banner, padded to width L with '=' signs.

    C selects the background (and rule) color, T the text color.
    """
    BACK = {'Black': Back.BLACK, 'Red': Back.RED, 'Green': Back.GREEN,
            'Yellow': Back.YELLOW, 'Blue': Back.BLUE,
            'Magenta': Back.MAGENTA, 'Cyan': Back.CYAN}
    FORE = {'Black': Fore.BLACK, 'Red': Fore.RED, 'Green': Fore.GREEN,
            'Yellow': Fore.YELLOW, 'Blue': Fore.BLUE,
            'Magenta': Fore.MAGENTA, 'Cyan': Fore.CYAN, 'White': Fore.WHITE}
    # Colored label, then a rule of '=' filling the rest of the line.
    banner = BACK[C] + FORE[T] + Style.NORMAL + Text + Style.RESET_ALL
    rule = FORE[C] + Style.NORMAL + (L - len(Text) - 1) * '=' + Style.RESET_ALL
    print(banner + ' ' + rule)
def Line(L=100, C = 'Blue'):
    """Print a horizontal rule of L '=' characters in color C."""
    FORE = {'Black': Fore.BLACK, 'Red': Fore.RED, 'Green': Fore.GREEN,
            'Yellow': Fore.YELLOW, 'Blue': Fore.BLUE,
            'Magenta': Fore.MAGENTA, 'Cyan': Fore.CYAN, 'White': Fore.WHITE}
    print(FORE[C] + Style.NORMAL + '=' * L + Style.RESET_ALL)
def Search_List(Key, List):
    """Return the elements of List that contain Key as a substring."""
    return list(filter(lambda item: Key in item, List))
def DataSize(Inp):
return pd.DataFrame({'Number of Instances': [Inp.shape[0]], 'Number of Attributes': [Inp.shape[1]]}).style.hide_index()
# Split the data with the Selector field (2 -> train, 1 -> test) and
# separate the Drinks target from the features in each subset.
Header(Text = 'Train')
Train = Data[Data['Selector'] == 2].drop(columns = 'Selector').reset_index(drop = True)
display(DataSize(Train))
y_train = Train[Target]
X_train = Train.drop(columns = [Target])
Header(Text = 'Test', C = 'Green')
Test = Data[Data['Selector'] == 1].drop(columns = 'Selector').reset_index(drop = True)
display(DataSize(Test))
y_test = Test[Target]
X_test = Test.drop(columns = [Target])
Line()
# Summarize the shapes of all four resulting sets in one table.
shapes = {'Set': ['X_train', 'X_test', 'y_train', 'y_test'],
          'Shape': [X_train.shape, X_test.shape, y_train.shape, y_test.shape]}
display(pd.DataFrame(data = shapes).set_index('Set').T)
Train ==============================================================================================
| Number of Instances | Number of Attributes |
|---|---|
| 200 | 6 |
Test ===============================================================================================
| Number of Instances | Number of Attributes |
|---|---|
| 145 | 6 |
====================================================================================================
| Set | X_train | X_test | y_train | y_test |
|---|---|---|---|---|
| Shape | (200, 5) | (145, 5) | (200,) | (145,) |
CatBoost is based on gradient boosted decision trees. During training, a set of decision trees is built consecutively, each successive tree being built with reduced loss compared to the previous trees.
# Train a CatBoost regressor on the GPU for N iterations.
# NOTE(review): od_wait=N with iterations=N effectively disables early
# stopping — training always runs the full budget, though the best
# iteration on the eval set is still tracked (and the model is shrunk
# to it afterwards, per the log below).
# NOTE(review): the standardized features X from Feature_Normalize are
# not used here; the model trains on the raw Train/Test splits — confirm
# this is intended.
N = int(1e5)
model = catboost.CatBoostRegressor(iterations= N, task_type="GPU", devices='0:1', max_ctr_complexity=5,
random_seed= 0, od_type='Iter', od_wait=N, verbose=int(N/10), depth=5)
_ = model.fit(X_train, y_train, eval_set=(X_test, y_test))
# clear_output()
Learning rate set to 0.008242 0: learn: 2.9761837 test: 4.0603519 best: 4.0603519 (0) total: 12.2ms remaining: 20m 15s 10000: learn: 1.8352110 test: 3.6248574 best: 3.6123507 (564) total: 1m 51s remaining: 16m 47s 20000: learn: 1.8255787 test: 3.6281617 best: 3.6123507 (564) total: 3m 38s remaining: 14m 34s 30000: learn: 1.8254004 test: 3.6273392 best: 3.6123507 (564) total: 5m 26s remaining: 12m 40s 40000: learn: 1.8253809 test: 3.6284896 best: 3.6123507 (564) total: 7m 13s remaining: 10m 49s 50000: learn: 1.8159583 test: 3.6486311 best: 3.6123507 (564) total: 8m 58s remaining: 8m 58s 60000: learn: 1.8159200 test: 3.6480488 best: 3.6123507 (564) total: 10m 48s remaining: 7m 12s 70000: learn: 1.7232629 test: 3.5970065 best: 3.5964915 (68758) total: 12m 34s remaining: 5m 23s 80000: learn: 1.5202133 test: 3.5867399 best: 3.5736086 (75186) total: 14m 19s remaining: 3m 34s 90000: learn: 1.5199647 test: 3.5862163 best: 3.5736086 (75186) total: 15m 47s remaining: 1m 45s 99999: learn: 1.5175128 test: 3.5835455 best: 3.5736086 (75186) total: 17m 13s remaining: 0us bestTest = 3.57360864 bestIteration = 75186 Shrink model to first 75187 iterations.
# Feature importance weights from the trained model, sorted descending.
feat_importance = pd.DataFrame(list(zip(X_train.dtypes.index,
model.get_feature_importance(catboost.Pool(X_train, label=y_train)))),
columns=['Features','Weight']).sort_values('Weight', ascending = False)
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(14, 4))
# First pass draws the filled, hatched bars; second pass overlays a plain
# black outline on the same bars.
_ = sns.barplot(y="Features", x="Weight", palette="PuRd", edgecolor = 'RoyalBlue', hatch = '///',
data = feat_importance, ax = ax)
_ = sns.barplot(y="Features", x="Weight", facecolor = 'None', edgecolor = 'Black', data = feat_importance, ax = ax)
_ = ax.set_xlim([0, 25])
_ = ax.set_title('Feature Importance')
# Visualize the first tree of the ensemble.
model.plot_tree(tree_idx=0, pool= catboost.Pool(X_train, label=y_train))
The best result for each metric calculated on each validation dataset.
# Best metric value reached on each dataset during training.
display(pd.DataFrame(model.get_best_score()))
| learn | validation | |
|---|---|---|
| RMSE | 1.517513 | 3.573609 |
R2 Score
# R^2 on the train set vs. the held-out test set.
# NOTE(review): the large gap shown in the output below suggests
# overfitting — worth flagging in the analysis.
display(pd.DataFrame({'Train Set': {'R2 Score': model.score(X_train, y_train)},
'Validation Set': {'R2 Score': model.score(X_test, y_test)}}))
| Train Set | Validation Set | |
|---|---|---|
| R2 Score | 0.716551 | 0.226301 |